library(tidyverse)
library(philentropy)

#load(file="/nfs/waterston/files_for_lou/fly_files_for_Angles.RData")
#allExpr <- exp_matrix_fly_mean

# Cosine of the angle between two numeric vectors of equal length.
#   v1, v2: numeric vectors
# Returns a plain numeric scalar; it is NaN when either vector has zero norm
# and NA when either vector contains NA.
angleCosine <- function(v1, v2) {
  innerProduct <- v1 %*% v2          # 1 x 1 matrix
  norm1 <- sqrt(sum(v1^2))
  norm2 <- sqrt(sum(v2^2))
  cosine <- innerProduct / (norm1 * norm2)
  # drop the matrix wrapper so callers get a bare number
  cosine[1, 1]
}

# FBgn lookup tables: one keyed by gene name, one keyed by alias
names <- read_tsv(file = "/net/waterston/vol9/dm6/r6.49.nameFBgn.tsv")
alias <- read_tsv(file = "/net/waterston/vol9/dm6/r6.49.aliasFBgn.tsv")

# long-format expression table (gene, FBgn, cellType, TPM are the columns used here)
allExpr <- read_tsv(file = "/net/waterston/vol9/ChipSeqPipeline/fly_pca300_vers3.final.annotation.bootstrapTPM.tsv")
distGeneFBgn <- allExpr %>% dplyr::distinct(gene, FBgn)
# sanity check: number of distinct gene symbols vs. distinct FBgn ids
nrow(dplyr::distinct(allExpr, gene))
nrow(dplyr::distinct(allExpr, FBgn))

# mean-center every gene's TPM across cell types, then spread to one column
# per cell type (TPM now holds the centered value)
exprMu <- allExpr %>%
  dplyr::group_by(gene) %>%
  dplyr::summarise(mu = mean(TPM))
allExpr <- allExpr %>%
  left_join(exprMu, by = "gene") %>%
  mutate(centeredTPM = TPM - mu) %>%
  select(-TPM, -mu) %>%
  rename(TPM = centeredTPM) %>%
  pivot_wider(names_from = "cellType", values_from = "TPM")

# Load the TF -> target table and keep only peaks whose cluster size (nPeaks)
# lies within [minPeakCount, maxPeakCount].
fly_TF_targets <- read_tsv(file = "/net/waterston/vol6/files_for_lou/fly_TF_targets.tsv")
minPeakCount <- 2
# 84 is the cutoff actually in effect (the output file names carry "2_84");
# an earlier assignment of 277 was overwritten before use and has been removed.
maxPeakCount <- 84

fly_TF_targets <- fly_TF_targets %>%
  filter(nPeaks >= minPeakCount, nPeaks <= maxPeakCount) %>%
  mutate(
    # TF names were stored with spelled-out parentheses; restore them
    TF = str_replace_all(TF, "lparen", "\\("),
    TF = str_replace_all(TF, "rparen", "\\)"),
    # the life stage is the third "_"-separated field of the peak id
    lifeStage = str_split_i(peakID, "_", 3)
  ) %>%
  # keep a single row per (TF, target gene) pair
  distinct(TF, Gene, .keep_all = TRUE)

# Centered expression of every TF in the target table.
# The TF symbol is resolved to an FBgn id by gene name first, by alias otherwise.
allTFs <- fly_TF_targets %>% dplyr::distinct(TF)
jn <- allTFs %>%
  left_join(names, by = c("TF" = "Name")) %>%
  rename(nameFBgn = FBgn) %>%
  left_join(alias, by = c("TF" = "Alias")) %>%
  rename(aliasFBgn = FBgn) %>%
  mutate(FBgn = if_else(is.na(nameFBgn), aliasFBgn, nameFBgn)) %>%
  select(-nameFBgn, -aliasFBgn)
allTFexp <- jn %>%
  left_join(allExpr, by = c("FBgn" = "FBgn")) %>%
  select(-gene)

# table translating the TF symbols used in the target file (OldName) to the
# names in the FBgn name table (NewName); joined on the shared columns
jj <- jn %>%
  left_join(names) %>%
  rename(OldName = "TF", NewName = "Name")
write_tsv(jj, file = "/net/waterston/vol9/ChipSeqPipeline/OldToNewFlyTFNames.tsv")

# restrict the peaks to a single life stage
stage <- "embryonic"
stageTargets <- fly_TF_targets %>% filter(lifeStage == stage)
tfs <- stageTargets %>% distinct(TF)

# Centered expression of every target gene seen at that stage.
# Gene symbols are resolved to FBgn by name first, by alias otherwise;
# targets left with any NA (e.g. no expression match) are dropped.
stageTargetGenes <- stageTargets %>% dplyr::distinct(Gene)
jn <- stageTargetGenes %>%
  left_join(names, by = c("Gene" = "Name")) %>%
  rename(nameFBgn = FBgn) %>%
  left_join(alias, by = c("Gene" = "Alias")) %>%
  rename(aliasFBgn = FBgn) %>%
  mutate(FBgn = if_else(is.na(nameFBgn), aliasFBgn, nameFBgn)) %>%
  select(-nameFBgn, -aliasFBgn)
allTargetExpression <- jn %>%
  left_join(allExpr, by = c("FBgn" = "FBgn")) %>%
  select(-gene) %>%
  na.omit()


#allExpr <- readRDS(file="/net/waterston/vol6/files_for_lou/exp_matrix_PCA300_cell.type.2_no_doubletons_mean.rds" )
#allExpr <- as.data.frame(allExpr)
#allExpr <- mutate(allExpr,gene = rownames(allExpr))

#allExpr <- read_tsv(file="/net/waterston/vol6/files_for_lou/fly_exp_matrix_mean.tsv")
#nrow(distinct(allExpr,gene))
#allExpr <- left_join(allExpr,names,by=c("gene"="Name")) %>% rename(byName=FBgn)
#allExpr <- left_join(allExpr,alias,by=c("gene"="Alias")) %>% rename(byAlias=FBgn)
#allExpr <- mutate(allExpr,byName=if_else(is.na(byName),byAlias,byName)) %>% rename(FBgn=byName) %>% select(-byAlias)

# split the gene symbol off the expression matrix; geneFBgn keeps the
# gene <-> FBgn pairing row for row, allExpr keeps FBgn plus the cell types
geneFBgn <- allExpr %>% select(gene, FBgn)
allExpr <- allExpr %>% select(-gene)

# expression of the stage's TFs, looked up through the tfFBgn column of the
# target table
tfs <- stageTargets %>% distinct(TF, tfFBgn)
alltfExpr <- tfs %>%
  left_join(allExpr, by = c("tfFBgn" = "FBgn")) %>%
  select(-TF)

# For each TF, measure how similar its mean-centered expression profile is to
# that of each of its targets and, as background, to that of an equally sized
# random sample of genes. Similarity is the cosine returned by angleCosine().
l <- list()   # target angles, one tibble per TF
rl <- list()  # random-gene angles, one tibble per TF

# The random sample is grouped by gene symbol, but `gene` was split off
# allExpr into geneFBgn before this point, which made group_by(gene) fail.
# geneFBgn holds the same rows in the same order, so the column can simply be
# bound back on; if allExpr still carries it, allExpr is used unchanged.
if ("gene" %in% colnames(allExpr)) {
  randomPool <- allExpr
} else {
  randomPool <- bind_cols(select(geneFBgn, gene), allExpr)
}

# seq_len() so that an empty tfs table yields zero iterations
for (i in seq_len(nrow(tfs))) {
  tf <- tfs$TF[i]

  # expression of the TF itself, one row per cell type
  # (assumes the first two columns of allTFexp are the id columns TF and FBgn)
  tfExpr <- filter(allTFexp, TF == tf)
  tfExpr <- pivot_longer(tfExpr, cols = 3:ncol(tfExpr), names_to = "cellType", values_to = "TF_TPM")

  # expression of the TF's targets, one row per (gene, cell type);
  # targets without expression data come through as NA and are filtered
  # out after the loop
  targets <- filter(stageTargets, TF == tf)
  nTargets <- nrow(targets)
  targetExpr <- left_join(select(targets, "Gene"), allTargetExpression, by = "Gene")
  targetExpr <- pivot_longer(targetExpr, cols = 3:ncol(targetExpr), names_to = "cellType", values_to = "targetTPM")

  # cosine between the TF profile and every target profile
  j <- left_join(targetExpr, tfExpr, by = c("cellType"))
  angles <- group_by(j, Gene) %>% summarize(CosineAngle = angleCosine(targetTPM, TF_TPM))
  l[[tf]] <- left_join(angles, targets, by = "Gene")

  # same computation for nTargets randomly drawn genes; FBgn is renamed so it
  # does not clash with the TF's own FBgn column in the join.
  # NOTE(review): the draw is unseeded, so random angles differ between runs.
  randomExpr <- sample_n(randomPool, nTargets) %>% dplyr::rename(targetFBgn = FBgn)
  # pivot every non-id column, selected by name rather than by position
  randomExpr <- pivot_longer(randomExpr, cols = -c(gene, targetFBgn), names_to = "cellType", values_to = "targetTPM")
  j <- left_join(randomExpr, tfExpr, by = c("cellType"))
  randomAngles <- group_by(j, gene) %>% summarize(CosineAngle = angleCosine(targetTPM, TF_TPM)) %>% mutate(TF = tf)
  rl[[tf]] <- mutate(randomAngles, Gene = gene) %>% select(-gene)
}

# stack the per-TF results and drop pairs whose angle could not be computed
a <- bind_rows(l) %>% filter(!is.na(CosineAngle))
ra <- bind_rows(rl) %>% filter(!is.na(CosineAngle))

# number of target angles per TF
g <- dplyr::group_by(a, TF) %>% dplyr::summarise(count = n())

# `aa` is not defined in the visible script (presumably an earlier result kept
# in an interactive session), so the unguarded comparison aborted a clean run
# before anything was written. Only compare when such an object exists.
if (exists("aa")) {
  print(setdiff(dplyr::distinct(a, TF), dplyr::distinct(aa, TF)))
}

# target and random angles, written separately ...
write_tsv(a, file = str_c("/net/waterston/vol6/files_for_lou/fly_TF_TargetAngles_2_84_no_doubletons_2", ".tsv", sep = ""))
write_tsv(ra, file = str_c("/net/waterston/vol6/files_for_lou/fly_TF_RandomAngles_2_84_no_doubletons_2", ".tsv", sep = ""))

# ... and stacked into one table, labelled by Source
a <- mutate(a, Source = "Target")
ra <- mutate(ra, Source = "Random")
a <- bind_rows(a, ra)
write_tsv(a, file = str_c("/net/waterston/vol9/ChipSeqPipeline/fly_TF_Angle_2_84_no_doubletons_2", ".tsv", sep = ""))


tfs <- distinct(a, TF)

# For each TF, compare the distribution of target angles with that of the
# random angles: Jensen-Shannon distance on 0.1-wide histograms, Welch t-test,
# Wilcoxon rank-sum test and a chi-square test on quantile bins.
# TFs with 2 or fewer angles in either group are skipped.

tlist <- list()
angleBreaks <- seq(-1, 1, .1)  # histogram bins over the cosine range
for (i in seq_len(nrow(tfs))) {

  tf <- tfs$TF[[i]]
  targetAngles <- filter(a, TF == tf, Source == "Target")
  randomAngles <- filter(a, TF == tf, Source == "Random")
  # scalar condition: && instead of the vectorised &
  if (nrow(targetAngles) > 2 && nrow(randomAngles) > 2) {

    # Jensen-Shannon divergence between the two normalised histograms
    targetHist <- hist(targetAngles$CosineAngle, breaks = angleBreaks, plot = FALSE)
    targetDist <- targetHist$counts / sum(targetHist$counts)
    randomHist <- hist(randomAngles$CosineAngle, breaks = angleBreaks, plot = FALSE)
    randomDist <- randomHist$counts / sum(randomHist$counts)
    js <- jensen_shannon(targetDist, randomDist, testNA = FALSE, unit = "log")

    # locals named so they do not shadow base::t() / stats::df()
    tfAngles <- filter(a, TF == tf)
    tRes <- t.test(CosineAngle ~ Source, data = tfAngles)
    wRes <- wilcox.test(CosineAngle ~ Source, data = tfAngles)

    # chi-square on bins cut at every 7th percentile of the pooled angles.
    # NOTE(review): seq(0, 1, .07) ends at 0.98, so angles above the 98th
    # percentile fall outside the last break, get an NA bin and are left out
    # of the table -- confirm this is intended.
    q <- quantile(tfAngles$CosineAngle, seq(from = 0, to = 1.0, .07))
    tfAngles$outcome_bin <- cut(tfAngles$CosineAngle, breaks = q, include.lowest = TRUE)
    tab <- with(tfAngles, table(outcome_bin, Source))
    chiRes <- chisq.test(tab)

    # t.test orders groups alphabetically: estimate 1 = Random, 2 = Target
    tlist[[tf]] <- tibble(
      TF = tf, nTargets = nrow(targetAngles), JSD = sqrt(js),
      wilcoxP = wRes$p.value, chiP = chiRes$p.value, t_test_p = tRes$p.value,
      targetMean = tRes$estimate[[2]], randomMean = tRes$estimate[[1]]
    )
  }
}
ttest <- bind_rows(tlist)
saveRDS(ttest, str_c("/net/waterston/vol9/ChipSeqPipeline/Fly", "_JSD_ChiSquare_ttest_no_doubletons_2_meta84.rds", sep = ""))



